/* This do-file appends the waves of the General Household Survey (GHS) from 2002 to 2011
and extracts the variables needed to replicate Figures 2, 3, and 4. Please
refer to the text of the paper for a description of this dataset. */

* Turn off the "more" pause so the do-file runs through without stopping
set more off

******************************
*  General Household Survey  *
******************************

* Working directory: the folder that contains the raw GHS files.
* The path is deliberately left empty and must be filled in before running.
cd ""

* Build one individual-level file per wave for 2002-2008 by combining the
* Person file with the Worker file of the same year. Records are matched on
* household number, person number and survey year.

forvalues wave = 2002/2008 {
    use "GHS_Person_`wave'_rwt2013_F1.dta", clear
    * Old-style merge expects the data in memory to be sorted on the match keys
    sort UqNr PersonNr year
    merge UqNr PersonNr year using "GHS_Worker_`wave'_rwt2013_F1.dta"
    * The match indicator is not inspected here, so it is simply removed
    drop _merge
    save "GHS_`wave'_rwt2013_F1.dta", replace
}

* Extract Child Grant Information from the 2002 Household Module *

* Load only the household number and the child grant variable, then store
* the extract sorted by household number so it can be merged on UqNr.
use UqNr Q443Chld using "GHS_House_2002_F1.dta", clear
sort UqNr
save "GHS_House_2002.dta", replace


* Append GHS waves (individual level) 2002-2011 *

* Start from the 2002 wave: it is the only wave that gets the child grant
* variable attached from the household module.
use "GHS_2002_rwt2013_F1.dta", clear
sort UqNr

* Attach the household-level child grant variable to every person of the
* household. nokeep discards households that appear only in the household
* extract: without it they would be added to the person-level data as extra
* rows with no person number and no survey year, and would end up in the
* saved file. The match indicator is dropped once the merge is done.
merge UqNr using "GHS_House_2002.dta", nokeep
drop _merge

* 2003-2008: the combined Person + Worker files built earlier in this do-file.
* NB: force lets append proceed when a variable is string in one file and
* numeric in another; the values coming from the appended file are then set
* to missing for that variable.
forvalues wave = 2003/2008 {
    append using "GHS_`wave'_rwt2013_F1.dta", force
}

* 2009-2011: the Person files are appended directly for these waves
forvalues wave = 2009/2011 {
    append using "GHS_Person_`wave'_rwt2013_F1.dta", force
}


******************************
*  Define Relevant Variables *
******************************

** Age **

* For every wave other than 2006 the value is copied from Age; 2006 keeps
* whatever is already stored in age (presumably that wave names the variable
* in lower case; confirm against the raw files). Age is then discarded.
replace age = Age if year != 2006
drop Age

** Child Grant **

/* NB: in 2002 the child grant is recorded only at the household level, in the
household module, and is not coded for individual recipients. From 2003 onwards
the respondent is asked, for each individual child of eligible age, whether a
child grant is received for him/her. */

* Pick the wave-specific source variable
gen child_grant = Q443Chld if year == 2002
replace child_grant = Q138chil if year == 2003
replace child_grant = Q150Chil if year == 2004
replace child_grant = Q137chil if year == 2005
replace child_grant = Q133chil if inlist(year, 2006, 2007, 2008)
replace child_grant = Q136bcsg if inlist(year, 2009, 2010)
replace child_grant = Q132bcsg if inlist(year, 2011, 2012)

* Codes 3, 8 and 9 are set to missing and 2 becomes 0 (presumably 2 is "No"
* and 3/8/9 are non-substantive answers; confirm against the codebook).
recode child_grant (3 8 9 = .) (2 = 0)

* Every remaining missing value is counted as not receiving the grant
replace child_grant = 0 if child_grant == .

label define grant 0 " No" 1 "Yes"
label values child_grant grant

** Household Id **

* Convert the household number to numeric; force turns any value that
* contains non-numeric characters into missing instead of stopping.
destring UqNr, replace force
rename UqNr household_id

** Sample weight **

* The weight variable appears under two spellings; use Person_wgt and fall
* back on person_wgt wherever the first one is missing.
gen weight = Person_wgt
replace weight = person_wgt if weight == .

** Person id **

* The person number also appears under two spellings; make both numeric,
* then use PersonNr and fall back on PersonNR wherever it is missing.
destring PersonNr, replace
destring PersonNR, replace

gen id = PersonNr
replace id = PersonNR if id == .

** Province **

rename Prov province

** Gender **

* For 2006 the value is taken from the lower-case variable
replace Gender = gender if year == 2006
rename Gender sex

** Race **

* For every wave other than 2006 the value is taken from Race
replace race = Race if year != 2006

** Marital Status **

* married is a 0/1 indicator built from the wave-specific marital status question.
* Careful: the question changes between 2004 and 2005. Up to 2004 only code 1
* counts as married; from 2005 onwards codes 1 and 2 both count.
gen married = (Q12Maris == 1) if year == 2002
replace married = (Q12amari == 1) if inrange(year, 2003, 2004)
replace married = inlist(Q12amari, 1, 2) if inrange(year, 2005, 2008)
replace married = inlist(Q12amarst, 1, 2) if inrange(year, 2009, 2011)

* Observations not covered by any of the waves above are set to not married
replace married = 0 if married == .

** Mother ID **

* Only the mother's person number is constructed here; no spouse identifier
* is created in this section.
gen mother_id = Q14mpsnn if year == 2002
replace mother_id = Q14cmpsn if inrange(year, 2003, 2011)

* 88, 92, 93 and 99 are treated as missing (presumably special codes rather
* than real person numbers; confirm against the codebook).
recode mother_id (88 92 93 99 = .)



****************************
*     Save Cleaned Data    * 
****************************

* Keep only the harmonised variables and put them in a fixed column order
keep id age year sex race province child_grant household_id weight married mother_id

order id age year sex race province child_grant household_id weight married mother_id

* stable keeps tied observations in their existing relative order. A plain
* sort breaks ties at random, so which of several records sharing the same
* year, household and person number comes first could differ between runs,
* and with it the record retained by duplicates drop below.
sort year household_id id, stable

* Keep the first record for each household-person-year combination.
* NB: force is required because only the key variables are compared; records
* that share the key but differ in other variables are dropped without warning.
duplicates drop household_id id year, force

save south_africa_GHS.dta, replace
